We will compare some classifiers on the “Toxic” column.

Load libraries

library(tidyverse)
package ‘tidyverse’ was built under R version 4.0.5
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
-- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.1 --
v ggplot2 3.3.5     v purrr   0.3.4
v tibble  3.1.3     v dplyr   1.0.7
v tidyr   1.1.3     v stringr 1.4.0
v readr   2.0.1     v forcats 0.5.1
package ‘ggplot2’ was built under R version 4.0.5
package ‘tibble’ was built under R version 4.0.5
package ‘tidyr’ was built under R version 4.0.5
package ‘purrr’ was built under R version 4.0.5
package ‘dplyr’ was built under R version 4.0.5
package ‘stringr’ was built under R version 4.0.5
package ‘forcats’ was built under R version 4.0.5
-- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(tictoc)
package ‘tictoc’ was built under R version 4.0.5
library(caret)
package ‘caret’ was built under R version 4.0.5
Loading required package: lattice
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     

Attaching package: ‘caret’

The following object is masked from ‘package:purrr’:

    lift
library(MASS)

Attaching package: ‘MASS’

The following object is masked from ‘package:dplyr’:

    select
source("./parameters.R")

Open the Bag of Words with labels

# Open the bag of words.
# `col_types_df` is not defined in this notebook; it is expected to come from
# parameters.R, which is sourced above.
fileName <- "bow_tfidf__min_words_100_2grams_1000__sampling_balanced__cor_cut_0.5_from_1408_to_1247_rm0.csv"
df <- read_csv(fileName, col_types = col_types_df)

# Drop columns 2, 3 and 5 to 9 by position.
# NOTE(review): presumably these are the text and the other label columns,
# leaving the train/test flag, `df_toxic` and the features -- confirm against
# the CSV header.
df <- df[, -c(2, 3, 5:9)]

# During tests, we can work on a random sample (one tenth of the rows).
# `sampled` stays a logical switch: the sample size gets its own name, so the
# flag is never overwritten with a number and no global called `max` is
# created next to base::max().
sampled <- FALSE
if (isTRUE(sampled)) {
  set.seed(42)
  n_rows <- nrow(df)
  n_sampled <- round(n_rows / 10)
  df <- df[sample(n_rows, n_sampled), ]
}

# show the data set
df

Splitting the data

# Split between train and test.
# The first column is the split flag: rows with value 1 go to the training
# set, rows with value 2 to the test set. `df[[1]]` extracts the flag as a
# plain vector, so rows are selected with a logical vector rather than a
# one-column logical tibble. The flag column itself is dropped (-1).
df_train <- df[df[[1]] == 1, -1]
df_test  <- df[df[[1]] == 2, -1]

# Split the train set between features and labels.
# NOTE(review): dropping column 1 as "the label" assumes `df_toxic` is the
# first remaining column -- confirm against the CSV header.
X_train <- df_train[, -1]
Y_train <- as.factor(df_train$df_toxic)

# Split the test set between features and labels
X_test <- df_test[, -1]
Y_test <- as.factor(df_test$df_toxic)

Have a look

# Print each piece of the split for a visual check: the feature tables as they
# are, the label factors wrapped in a data frame.
X_train
as.data.frame(Y_train) 
X_test
as.data.frame(Y_test)

Train the model

# Start from an empty timing log. The summary at the end of the notebook reads
# the log by position (entries 1 and 2), so leftovers from an earlier run in
# the same session would otherwise be reported instead of this run's timings.
tic.clearlog()
tic("Training: ")

# Fit a linear discriminant analysis on the training features and labels.
f <- lda(X_train, Y_train)

# Stop the timer and append the elapsed time to the log.
toc(log = TRUE)
Training: : 227.22 sec elapsed

This takes some time…

What’s in f ?


Restarting R session...

Let’s do the inference on the test set now.

# Score the held-out rows with the fitted model, timing the call and
# recording the elapsed time in the tictoc log.
tic("Inference: ")
predictions <- predict(f, newdata = X_test)
toc(log = TRUE)
Inference: : 5.75 sec elapsed
# Flatten the object returned by predict() into a data frame and display it.
# The predicted label is read from its `class` column further down.
Y_pred = as.data.frame(predictions)
Y_pred

What does the confusion matrix give us?

writeLines("\n")
# caret::confusionMatrix() expects the predicted classes as `data` and the
# true classes as `reference`. Passing them the other way round leaves the
# accuracy unchanged but transposes the table, so Sensitivity/Specificity and
# Pos/Neg Pred Value end up under each other's names.
mat <- confusionMatrix(data = as.factor(Y_pred$class), reference = Y_test)
mat
Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 3714 1916
         1 1112 4643
                                          
               Accuracy : 0.734           
                 95% CI : (0.7258, 0.7421)
    No Information Rate : 0.5761          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.4672          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.7696          
            Specificity : 0.7079          
         Pos Pred Value : 0.6597          
         Neg Pred Value : 0.8068          
             Prevalence : 0.4239          
         Detection Rate : 0.3262          
   Detection Prevalence : 0.4945          
      Balanced Accuracy : 0.7387          
                                          
       'Positive' Class : 0               
                                          
# Marker printed by the last chunk, followed by the time of the run.
print("END: all the notebook ran.")
[1] "END: all the notebook ran."
Sys.time()
[1] "2021-08-24 13:31:48 CEST"
# One-line summary of the run: input file, then overall accuracy
# (first element of `mat$overall`).
writeLines(paste0("File: ", fileName))
File: bow_tfidf__min_words_100_2grams_1000__sampling_balanced__cor_cut_0.5_from_1408_to_1247_rm0.csv
writeLines(paste0("Accuracy: ", mat$overall[1]))
Accuracy: 0.734036012296882
# Timings are read by position from the tictoc log (1st and 2nd entries), so
# they describe this run only if the log was empty when training started.
writeLines(paste0(tic.log(format = TRUE)[1][1]))
Training: : 179.6 sec elapsed
writeLines(paste0(tic.log(format = TRUE)[2][1]))
Inference: : 6.01 sec elapsed

Results

bow_tfidf__min_words_100_2grams_1000__sampling_balanced__cor_cut_0.3_from_1408_to_1110_rm0.csv

Accuracy: 0.735107618111994 Training: : 0.1 sec elapsed Inference: : 0.11 sec elapsed

          Reference
Prediction    0    1
         0 3719 1847
         1 1119 4512

bow_tfidf__min_words_100_2grams_1000__sampling_balanced__cor_cut_0.3_from_1408_to_1110.csv

Accuracy: 0.725739848589126 Training: : 179.6 sec elapsed Inference: : 6.01 sec elapsed

          Reference
Prediction    0    1
         0 3708 2075
         1 1113 4728

bow_tfidf__min_words_100_2grams_1000__sampling_balanced__cor_cut_0.5_from_1408_to_1247_rm0.csv

Accuracy: 0.734036012296882 Training: : 179.6 sec elapsed Inference: : 6.01 sec elapsed

          Reference
Prediction    0    1
         0 3714 1916
         1 1112 4643

LS0tDQp0aXRsZTogIkxEQSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCg0KV2Ugd2lsbCBjb21wYXJlIHNvbWUgY2xhc3NpZmllcnMgb24gdGhlICJUb3hpYyIgY29sdW1uLg0KDQpMb2FkIGxpYnJhcmllcw0KDQpgYGB7cn0NCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeSh0aWN0b2MpDQpsaWJyYXJ5KGNhcmV0KQ0KbGlicmFyeShNQVNTKQ0Kc291cmNlKCIuL3BhcmFtZXRlcnMuUiIpDQpgYGANCg0KDQojIE9wZW4gdGhlIEJhZyBvZiBXb3JkIHdpdGggbGFiZWxzDQoNCmBgYHtyfQ0KIyBPcGVuIHRoZSBiYWcgb2Ygd29yZHMNCmZpbGVOYW1lID0gImJvd190ZmlkZl9fbWluX3dvcmRzXzEwMF8yZ3JhbXNfMTAwMF9fc2FtcGxpbmdfYmFsYW5jZWRfX2Nvcl9jdXRfMC41X2Zyb21fMTQwOF90b18xMjQ3X3JtMC5jc3YiDQpkZiA9IHJlYWRfY3N2KGZpbGVOYW1lLCBjb2xfdHlwZXM9Y29sX3R5cGVzX2RmKQ0KZGYgPSBkZlssLWMoMiwzLDU6OSldDQojIER1cmluZyB0ZXN0cywgd2UgY2FuIHdvcmsgb24gYSBzYW1wbGUNCnNhbXBsZWQgPSBGQUxTRQ0KaWYgKHNhbXBsZWQgPT0gVFJVRSkgew0KICBzZXQuc2VlZCg0MikNCiAgbWF4ID0gbnJvdyhkZikNCiAgc2FtcGxlZCA9IHJvdW5kKG1heC8xMCkNCiAgZGYgPSBkZltzYW1wbGUobWF4LCBzYW1wbGVkKSwgXQ0KfQ0KIyBzaG93IHRoZSBkYXRhIHNldA0KZGYNCmBgYA0KDQoNCiMgU3BsaXR0aW5nIHRoZSBkYXRhDQoNCmBgYHtyfQ0KIyBTcGxpdCBiZXR3ZWVuIHRyYWluIGFuZCB0ZXN0DQpkZl90cmFpbiA9IGRmW2RmWzFdID09IDEsLTFdDQpkZl90ZXN0ICA9IGRmW2RmWzFdID09IDIsLTFdDQoNCiMgU3BsaXQgdGhlIHRlc3Qgc2V0IGJldHdlZW4gZmVhdHVyZXMgYW5kIGxhYmVscw0KWF90cmFpbiA9IGRmX3RyYWluWywtMV0NCllfdHJhaW4gPSBhcy5mYWN0b3IoZGZfdHJhaW4kZGZfdG94aWMpDQoNCiMgU3BsaXQgdGhlIHRyYWluIHNldCBiZXR3ZWVuIGZlYXR1cmVzIGFuZCBsYWJlbHMNClhfdGVzdCA9IGRmX3Rlc3RbLC0xXQ0KWV90ZXN0ID0gYXMuZmFjdG9yKGRmX3Rlc3QkZGZfdG94aWMpDQpgYGANCg0KIyBIYXZlIGEgbG9vaw0KDQpgYGB7cn0NClhfdHJhaW4NCmFzLmRhdGEuZnJhbWUoWV90cmFpbikgDQpYX3Rlc3QNCmFzLmRhdGEuZnJhbWUoWV90ZXN0KQ0KYGBgDQoNCg0KIyBUcmFpbiB0aGUgbW9kZWwNCg0KYGBge3J9DQp0aWMoIlRyYWluaW5nOiAiKQ0KDQpmIDwtIGxkYShYX3RyYWluLFlfdHJhaW4pDQoNCnRvYyhsb2cgPSBUUlVFKQ0KYGBgDQpUaGlzIHRha2VzIHNvbWUgdGltZS4uLg0KDQpXaGF0J3MgaW4gZiA/DQoNCmBgYHtyfQ0KI2YNCmBgYA0KDQoNCkxldCdzIGRvIHRoZSBpbmZlcmVuY2Ugb24gdGhlIHRlc3Qgc2V0IG5vdy4NCg0KYGBge3J9DQojIERvIHRoZSBpbmZlcmVuY2Ugb24gdGhlIHRlc3Qgc2V0DQp0aWMoIkluZmVyZW5jZTogIikNCnByZWRpY3Rpb25zID0gcHJlZGljdChmLFhfdGVzdCkNCnRvYyhsb2cgPSBUUlVFKQ0KWV9wcmVk
ID0gYXMuZGF0YS5mcmFtZShwcmVkaWN0aW9ucykNCllfcHJlZA0KYGBgDQoNCg0KV2hhdCBkb2VzIHRoZSBjb25mdXNpb24gbWF0cml4IGdpdmVzIHVzPw0KDQpgYGB7cn0NCndyaXRlTGluZXMoIlxuIikNCm1hdCA9IGNvbmZ1c2lvbk1hdHJpeChZX3Rlc3QsIGFzLmZhY3RvcihZX3ByZWQkY2xhc3MpKQ0KbWF0DQpgYGANCg0KYGBge3J9DQpwcmludCgiRU5EOiBhbGwgdGhlIG5vdGVib29rIHJhbi4iKQ0KU3lzLnRpbWUoKQ0KDQp3cml0ZUxpbmVzKHBhc3RlMCgiRmlsZTogIiwgZmlsZU5hbWUpKQ0Kd3JpdGVMaW5lcyhwYXN0ZTAoIkFjY3VyYWN5OiAiLCBtYXQkb3ZlcmFsbFsxXSkpDQp3cml0ZUxpbmVzKHBhc3RlMCh0aWMubG9nKGZvcm1hdCA9IFRSVUUpWzFdWzFdKSkNCndyaXRlTGluZXMocGFzdGUwKHRpYy5sb2coZm9ybWF0ID0gVFJVRSlbMl1bMV0pKQ0KYGBgDQoNCiMjIFJlc3VsdHMNCg0KIyMjIGJvd190ZmlkZl9fbWluX3dvcmRzXzEwMF8yZ3JhbXNfMTAwMF9fc2FtcGxpbmdfYmFsYW5jZWRfX2Nvcl9jdXRfMC4zX2Zyb21fMTQwOF90b18xMTEwX3JtMC5jc3YNCg0KQWNjdXJhY3k6IDAuNzM1MTA3NjE4MTExOTk0DQpUcmFpbmluZzogOiAwLjEgc2VjIGVsYXBzZWQNCkluZmVyZW5jZTogOiAwLjExIHNlYyBlbGFwc2VkDQoNClJlZmVyZW5jZQ0KUHJlZGljdGlvbiAgICAwICAgIDENCiAgICAgICAgIDAgMzcxOSAxODQ3DQogICAgICAgICAxIDExMTkgNDUxMg0KICAgICAgICAgDQojIyMgYm93X3RmaWRmX19taW5fd29yZHNfMTAwXzJncmFtc18xMDAwX19zYW1wbGluZ19iYWxhbmNlZF9fY29yX2N1dF8wLjNfZnJvbV8xNDA4X3RvXzExMTAuY3N2DQoNCkFjY3VyYWN5OiAwLjcyNTczOTg0ODU4OTEyNg0KVHJhaW5pbmc6IDogMTc5LjYgc2VjIGVsYXBzZWQNCkluZmVyZW5jZTogOiA2LjAxIHNlYyBlbGFwc2VkDQoNClJlZmVyZW5jZQ0KUHJlZGljdGlvbiAgICAwICAgIDENCiAgICAgICAgIDAgMzcwOCAyMDc1DQogICAgICAgICAxIDExMTMgNDcyOA0KICAgICAgICAgDQogICAgICAgICANCiMjIyBib3dfdGZpZGZfX21pbl93b3Jkc18xMDBfMmdyYW1zXzEwMDBfX3NhbXBsaW5nX2JhbGFuY2VkX19jb3JfY3V0XzAuNV9mcm9tXzE0MDhfdG9fMTI0N19ybTAuY3N2DQoNCkFjY3VyYWN5OiAwLjczNDAzNjAxMjI5Njg4Mg0KVHJhaW5pbmc6IDogMTc5LjYgc2VjIGVsYXBzZWQNCkluZmVyZW5jZTogOiA2LjAxIHNlYyBlbGFwc2VkDQoNClJlZmVyZW5jZQ0KUHJlZGljdGlvbiAgICAwICAgIDENCiAgICAgICAgIDAgMzcxNCAxOTE2DQogICAgICAgICAxIDExMTIgNDY0Mw==